import numpy as np #linear algebra
import pandas as pd # data processing,CSV file I/O(e.g pd.read_csv)
import seaborn as sns # for statistical data visualization
import matplotlib.pyplot as mtp # for data visualization
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline
# Load the mall-customers dataset from a local CSV file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific Windows path; the file
# must exist at exactly this location for the rest of the notebook to run.
data = pd.read_csv(r"C:\Users\laxma\Downloads\Mall_Customers.csv")
# Preview the first rows of the frame.
data.head()
| | CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) |
|---|---|---|---|---|---|
| 0 | 1 | Male | 19 | 15 | 39 |
| 1 | 2 | Male | 21 | 15 | 81 |
| 2 | 3 | Female | 20 | 16 | 6 |
| 3 | 4 | Female | 23 | 16 | 77 |
| 4 | 5 | Female | 31 | 17 | 40 |
# Preview the last rows of the frame.
data.tail()
| | CustomerID | Gender | Age | Annual Income (k$) | Spending Score (1-100) |
|---|---|---|---|---|---|
| 195 | 196 | Female | 35 | 120 | 79 |
| 196 | 197 | Female | 45 | 126 | 28 |
| 197 | 198 | Male | 32 | 126 | 74 |
| 198 | 199 | Male | 32 | 137 | 18 |
| 199 | 200 | Male | 30 | 137 | 83 |
# Count the rows that are exact duplicates of an earlier row.
data.duplicated().sum()
0
# Show the column labels of the frame.
data.columns
Index(['CustomerID', 'Gender', 'Age', 'Annual Income (k$)',
'Spending Score (1-100)'],
dtype='object')
# VISUALIZATION

# Bar chart: Age on the x-axis, CustomerID as the bar height.
plt.bar(x=data['Age'], height=data['CustomerID'])
plt.xticks(rotation=90)
plt.show()

# Interactive plotly bar chart of CustomerID against annual income,
# coloured by annual income.
fig = px.bar(
    data,
    x='Annual Income (k$)',
    y='CustomerID',
    color='Annual Income (k$)',
)
fig.show()

# Scatter of spending score against age.
plt.scatter(x=data['Age'], y=data['Spending Score (1-100)'], color='cyan')
plt.xticks(rotation=90)
plt.show()

# How many customers share each spending score.
plt.figure(figsize=(10, 4))
sns.countplot(data=data, x='Spending Score (1-100)', color='b')
plt.xticks(rotation=90)
plt.show()

# Horizontal count plot restricted to the ten most frequent income values.
plt.figure(figsize=(10, 4))
top_car = (
    data['Annual Income (k$)']
    .value_counts()
    .nlargest(10)
)
sns.countplot(
    y=data['Annual Income (k$)'],
    order=top_car.index,
    color='red',
)
<AxesSubplot:xlabel='count', ylabel='Annual Income (k$)'>
# Line plot of annual income against age.
sns.lineplot(
    data=data,
    x='Age',
    y='Annual Income (k$)',
)
<AxesSubplot:xlabel='Age', ylabel='Annual Income (k$)'>
# Bar plot of CustomerID (x) against Gender (y).
# x and y are passed by keyword: seaborn only accepts `data` positionally
# from version 0.12 on, and older versions emit a FutureWarning otherwise.
sns.barplot(x=data['CustomerID'], y=data['Gender'], color='r')
plt.xticks(rotation=90)
plt.show()
D:\anaconda files\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variables as keyword args: x, y. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation.
# Scatter of spending score against customer id, with explicit axis labels.
plt.figure(figsize=(8, 4))
sns.scatterplot(
    x='CustomerID',
    y='Spending Score (1-100)',
    data=data,
)
plt.ylabel('Spending Score (1-100)')
plt.xlabel('CustomerID')
plt.show()

# Distribution plot of the Gender column.
sns.displot(data["Gender"])
<seaborn.axisgrid.FacetGrid at 0x17dcab63640>
# Box plot of spending score for each distinct annual-income value.
sns.boxplot(
    data=data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
)
# One box per income value makes the tick labels dense, so rotate them.
plt.xticks(rotation=90)
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63]),
[Text(0, 0, '15'),
Text(1, 0, '16'),
Text(2, 0, '17'),
Text(3, 0, '18'),
Text(4, 0, '19'),
Text(5, 0, '20'),
Text(6, 0, '21'),
Text(7, 0, '23'),
Text(8, 0, '24'),
Text(9, 0, '25'),
Text(10, 0, '28'),
Text(11, 0, '29'),
Text(12, 0, '30'),
Text(13, 0, '33'),
Text(14, 0, '34'),
Text(15, 0, '37'),
Text(16, 0, '38'),
Text(17, 0, '39'),
Text(18, 0, '40'),
Text(19, 0, '42'),
Text(20, 0, '43'),
Text(21, 0, '44'),
Text(22, 0, '46'),
Text(23, 0, '47'),
Text(24, 0, '48'),
Text(25, 0, '49'),
Text(26, 0, '50'),
Text(27, 0, '54'),
Text(28, 0, '57'),
Text(29, 0, '58'),
Text(30, 0, '59'),
Text(31, 0, '60'),
Text(32, 0, '61'),
Text(33, 0, '62'),
Text(34, 0, '63'),
Text(35, 0, '64'),
Text(36, 0, '65'),
Text(37, 0, '67'),
Text(38, 0, '69'),
Text(39, 0, '70'),
Text(40, 0, '71'),
Text(41, 0, '72'),
Text(42, 0, '73'),
Text(43, 0, '74'),
Text(44, 0, '75'),
Text(45, 0, '76'),
Text(46, 0, '77'),
Text(47, 0, '78'),
Text(48, 0, '79'),
Text(49, 0, '81'),
Text(50, 0, '85'),
Text(51, 0, '86'),
Text(52, 0, '87'),
Text(53, 0, '88'),
Text(54, 0, '93'),
Text(55, 0, '97'),
Text(56, 0, '98'),
Text(57, 0, '99'),
Text(58, 0, '101'),
Text(59, 0, '103'),
Text(60, 0, '113'),
Text(61, 0, '120'),
Text(62, 0, '126'),
Text(63, 0, '137')])
# Violin plot of the spending-score distribution for each gender.
sns.violinplot(
    data=data,
    x='Gender',
    y='Spending Score (1-100)',
)
<AxesSubplot:xlabel='Gender', ylabel='Spending Score (1-100)'>
# MODEL BUILDING

# Feature matrix: columns 3 and 4 of the frame, i.e. 'Annual Income (k$)' and
# 'Spending Score (1-100)', as a plain NumPy array.
x = data.iloc[:, [3, 4]].values

import scipy.cluster.hierarchy as shc

# Draw the dendrogram of a Ward-linkage hierarchy over the feature matrix.
dendro = shc.dendrogram(shc.linkage(x, method='ward'))
mtp.title('Dendrogram Plot')
mtp.ylabel('Euclidean Distance')
mtp.xlabel('Customer')
mtp.show()

from sklearn.cluster import AgglomerativeClustering

# Ward linkage only supports Euclidean distance, which is also the default, so
# the distance argument is omitted. The former `affinity='euclidean'` keyword
# is deprecated in scikit-learn 1.2 and removed in 1.4; leaving it out gives
# the same clustering on old and new versions alike.
hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_pred = hc.fit_predict(x)
# Show the cluster label assigned to each customer.
y_pred
array([4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3,
4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 3, 4, 1,
4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 2, 0, 2, 0, 2,
1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2,
0, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2,
0, 2], dtype=int64)
# Scatter the customers in the (annual income, spending score) plane, one
# colour per cluster label. Labels 0..4 are shown as 'Cluster 1'..'Cluster 5'.
cluster_colors = ['blue', 'green', 'red', 'cyan', 'magenta']
for cluster_id, cluster_color in enumerate(cluster_colors):
    in_cluster = y_pred == cluster_id
    mtp.scatter(
        x[in_cluster, 0],
        x[in_cluster, 1],
        s=100,
        c=cluster_color,
        label=f'Cluster {cluster_id + 1}',
    )
mtp.title('cluster of customer')
mtp.xlabel('Annual Income(k$)')
# The y-axis holds the second feature column, which was previously unlabeled.
mtp.ylabel('Spending Score (1-100)')
mtp.legend()
mtp.show()